In [ ]:
import catboost
import pandas as pd
import numpy as np
import plotly_express as px
from scipy import stats

# Load the training and validation frames from local parquet files.
trainset = pd.read_parquet("data/numerai_training_data.parquet")
validset = pd.read_parquet("data/numerai_validation_data.parquet")

# Every column whose name contains "feature_" is used as a model input.
feature_names = [column for column in trainset.columns if "feature_" in column]

# CatBoost hyper-parameters; task_type='GPU' requests GPU training.
params = dict(
    iterations=1000,
    learning_rate=0.01,
    depth=6,
    task_type='GPU',
    verbose=False,
)

# Fit the regressor on the training features against the "target" column.
model = catboost.CatBoostRegressor(**params)
train_features = trainset[feature_names]
train_target = trainset["target"]
model.fit(train_features, train_target)

# Store the un-neutralized validation predictions as a new column on validset.
validset["base_preds"] = model.predict(validset[feature_names])
In [ ]:
# this is Numerai's code for feature neutralization:
# https://forum.numer.ai/t/model-diagnostics-feature-exposure/899

def get_biggest_change_features(corrs, n):
    """Return the ``n`` columns whose mean value shifts most between era halves.

    The index of ``corrs`` is sorted and split in two (the first half gets the
    smaller share when the count is odd). Each column is averaged within each
    half, and the columns are ranked by the absolute difference between the
    two averages.

    Parameters
    ----------
    corrs : pandas.DataFrame
        One row per era (eras in the index), one column per feature.
    n : int
        Number of column labels to return.

    Returns
    -------
    list
        Up to ``n`` column labels, largest absolute change first.
    """
    ordered_eras = corrs.index.sort_values()
    midpoint = len(ordered_eras) // 2

    first_half_mean = corrs.loc[ordered_eras[:midpoint], :].mean()
    second_half_mean = corrs.loc[ordered_eras[midpoint:], :].mean()

    # Magnitude of the shift from the first half to the second half.
    shift_size = (second_half_mean - first_half_mean).abs()
    ranked = shift_size.sort_values(ascending=False)
    return ranked.head(n).index.tolist()
    
def neutralize(df,
               columns,
               neutralizers=None,
               proportion=1.0,
               normalize=True,
               era_col="era"):
    """Neutralize ``columns`` against ``neutralizers`` separately within each era.

    For every distinct value of ``df[era_col]`` the selected columns are
    optionally rank-transformed to normal quantiles, then ``proportion`` times
    their least-squares projection onto the neutralizer columns (computed with
    a pseudo-inverse, no intercept term) is subtracted. The residual is divided
    by its population standard deviation, taken over all requested columns of
    that era together.

    Results are written back to the row positions they came from, so the
    output lines up with ``df.index`` even when the rows of an era are not
    contiguous in ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``era_col``, every name in ``columns`` and every name in
        ``neutralizers``.
    columns : list
        Column labels to neutralize.
    neutralizers : list, optional
        Column labels to neutralize against. ``None`` means no neutralizers.
    proportion : float
        Fraction of the projection to remove (1.0 removes all of it).
    normalize : bool
        If True, replace each column by the normal quantiles of its ordinal
        ranks before neutralizing.
    era_col : str
        Name of the grouping column.

    Returns
    -------
    pandas.DataFrame
        Same index as ``df``, one column per entry in ``columns``. Rows whose
        era value matches none of the groups (e.g. a NaN era) are NaN. An era
        whose residual has zero standard deviation yields non-finite values.
    """
    if neutralizers is None:
        neutralizers = []
    era_values = df[era_col]
    unique_eras = era_values.unique()

    if len(unique_eras) == 0:
        # Nothing to process: return an empty, correctly labelled frame.
        return pd.DataFrame(np.empty((0, len(columns))),
                            columns=columns,
                            index=df.index)

    computed = []
    row_positions = []
    for u in unique_eras:
        # Positional row numbers of this era inside df, kept so the results
        # can be scattered back to exactly those rows afterwards.
        era_rows = np.flatnonzero(np.asarray(era_values == u, dtype=bool))
        df_era = df.iloc[era_rows]
        # Copy so the in-place arithmetic below works on a private buffer.
        scores = df_era[columns].values.copy()
        if normalize:
            scores2 = []
            for x in scores.T:
                # Ordinal ranks -> mid-rank quantiles in (0, 1) -> normal scores.
                x = (stats.rankdata(x, method='ordinal') - .5) / len(x)
                x = stats.norm.ppf(x)
                scores2.append(x)
            scores = np.array(scores2).T
        exposures = df_era[neutralizers].values

        # Remove `proportion` of the linear projection onto the neutralizers.
        scores -= proportion * exposures.dot(
            np.linalg.pinv(exposures.astype(np.float32)).dot(scores.astype(np.float32)))

        scores /= scores.std(ddof=0)

        computed.append(scores)
        row_positions.append(era_rows)

    stacked = np.concatenate(computed)
    positions = np.concatenate(row_positions)

    # Scatter the era-grouped results back into the original row order.
    aligned = np.full((len(df),) + stacked.shape[1:], np.nan, dtype=stacked.dtype)
    aligned[positions] = stacked

    return pd.DataFrame(aligned,
                        columns=columns,
                        index=df.index)
In [ ]:
# Grid of settings to sweep: how much of the exposure to remove, and how many
# of the "riskiest" features to neutralize against.
neutralization_proportions = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1]
n_riskiests = [20, 50, 150, 300, 500, 700, 900, len(feature_names)]

# Per-era correlation of every feature with the base predictions
# (one row per era, one column per feature).
all_feature_corrs = validset.groupby("era").apply(
    lambda era: era[feature_names].corrwith(era["base_preds"])
)

# The riskiest-feature selection depends only on n_riskiest, not on the
# neutralization proportion, so compute it once per n instead of once per
# (proportion, n) pair.
riskiest_features_by_n = {
    n_riskiest: get_biggest_change_features(all_feature_corrs, n_riskiest)
    for n_riskiest in n_riskiests
}

# Outer list: one entry per proportion; inner list: one entry per n_riskiest.
y_corr_outputs = []
y_sharpe_outputs = []
for neutralization_proportion in neutralization_proportions:
    x_corr_outputs = []
    x_sharpe_outputs = []
    for n_riskiest in n_riskiests:
        riskiest_features = riskiest_features_by_n[n_riskiest]
        # Overwritten on every iteration; only the last setting's predictions
        # remain in validset after the sweep.
        validset["modified_preds"] = neutralize(
            validset,
            ["base_preds"],
            neutralizers=riskiest_features,
            proportion=neutralization_proportion,
            normalize=True,
        )

        era_wise_correlations = validset.groupby("era").apply(
            lambda era: np.corrcoef(era["modified_preds"], era["target"])[0, 1]
        )
        mean_corr = era_wise_correlations.mean()
        # Sharpe ratio: mean per-era correlation divided by its standard
        # deviation across eras.
        sharpe_ratio = mean_corr / era_wise_correlations.std()

        x_corr_outputs.append(mean_corr)
        x_sharpe_outputs.append(sharpe_ratio)

    y_corr_outputs.append(x_corr_outputs)
    y_sharpe_outputs.append(x_sharpe_outputs)
        
In [ ]:
# Turn the nested sweep results into 2-D arrays
# (rows: neutralization proportions, columns: numbers of risky features).
sharpe_outputs = np.array(y_sharpe_outputs)
corr_outputs = np.array(y_corr_outputs)

# Heatmap of the mean per-era correlation for every sweep setting.
# NOTE(review): px.imshow documents `aspect` as 'equal', 'auto' or None;
# confirm what the numeric value 1 is intended to do here.
# TODO: cell values are not annotated on the heatmap yet.
corr_heatmap_kwargs = dict(
    x=n_riskiests,
    y=neutralization_proportions,
    color_continuous_scale=px.colors.sequential.Plasma,
    title="Correlation with Targets",
    labels={"x": "N Risky Features", "y": "Neutralization Proportion"},
    width=800,
    height=800,
    aspect=1,
)
fig = px.imshow(corr_outputs, **corr_heatmap_kwargs)
fig.show(renderer="notebook")
In [ ]:
# Heatmap of the Sharpe ratio (mean / std of per-era correlation) for every
# sweep setting; same layout as the correlation heatmap.
# NOTE(review): px.imshow documents `aspect` as 'equal', 'auto' or None;
# confirm what the numeric value 1 is intended to do here.
sharpe_axis_labels = {"x": "N Risky Features", "y": "Neutralization Proportion"}
sharpe_heatmap_kwargs = dict(
    x=n_riskiests,
    y=neutralization_proportions,
    color_continuous_scale=px.colors.sequential.Plasma,
    title="Sharpe Ratios",
    labels=sharpe_axis_labels,
    width=800,
    height=800,
    aspect=1,
)
fig = px.imshow(sharpe_outputs, **sharpe_heatmap_kwargs)
fig.show(renderer="notebook")